Introduction

This document provides an analysis using a one-compartment toxicokinetic model. The dataset used comes from Gammarus pulex exposed to Malathion, as reported by Ashauer (2010). The analysis explores three perspectives: using predefined parameters, performing maximum likelihood estimation (MLE), and incorporating Bayesian approaches to account for parameter uncertainty.

Notes

  • The dataset used is from Ashauer (2010). The sampling_date column does not directly correspond to replicate in the dataset used by Sandrine.
  • We use the limit of quantification (LOQ) divided by 2 for below-detection data, which is a common but not entirely correct approach.
  • The priors are based on Hendriks et al. (2001).

Import Data

We load the dataset containing measurements of internal concentrations in Gammarus pulex over time. The dataset is visualized to understand the time course of internal concentrations. Unfortunately, we do not know which points belong to the same replicate.

## Read the raw measurements; sampling_date is a label, not a number,
## so turn it into a factor.
df <- read.table("data/data_Malathion_Ashauer_2010.csv", header = TRUE, sep = ",")
df$sampling_date <- factor(df$sampling_date)

## Time course of measured internal concentrations; the dashed line
## marks the end of the exposure phase (t = 1 day).
ggplot(df, aes(x = time, y = Cinternal)) +
    geom_point(size = 5, alpha = 0.7) +
    labs(x = "Time (days)",
         y = "Internal measured concentration (picomol/g wet weight)") +
    geom_vline(xintercept = 1, linetype = "dashed")

We also define the limit of quantification (LOQ).

## Limit of quantification (LOQ) in nmol/kg (wet weight),
## from Table 5 in the SI of Ashauer et al. (2010).
## Observations recorded as 0 are replaced by LOQ/2 in the likelihoods
## below (a common, though not entirely correct, approach).
LOQ <- 6

Define Deterministic Model

We define a deterministic toxicokinetic model with two parameters: uptake rate (\(k_u\)) and elimination rate (\(k_e\)). This model predicts the internal concentration based on the exposure concentration in water and the simulation times. Note that the implementation assumes that tsim is ordered.

## One-compartment toxicokinetic model.
##
## parameters: vector with uptake rate k.u (first element) and
##             elimination rate k.e (second element); extra elements
##             (e.g. an error parameter) are ignored.
## C_water:    exposure concentration in water (assumed constant
##             during the accumulation phase).
## tc:         time at which exposure ends [days].
## tsim:       simulation times; MUST be sorted increasingly, because
##             the output rows are ordered accumulation-then-depuration
##             and callers align the conc column with tsim positionally.
##
## Returns a data frame with columns time and conc (analytical solution
## of the accumulation and depuration phases).
bioacc <- function(parameters, C_water, tc, tsim){
    ## fail fast instead of silently returning rows out of tsim order
    stopifnot(!is.unsorted(tsim))

    k.u <- parameters[1]
    k.e <- parameters[2]

    ## accumulation phase: dC/dt = k.u*C_water - k.e*C, C(0) = 0
    tacc <- tsim[tsim <= tc]
    Cacc <- k.u*C_water*(1 - exp(-k.e*tacc))/k.e

    ## depuration phase: pure first-order elimination from C(tc)
    tdep <- tsim[tsim > tc]
    Cdep <- k.u*C_water*(exp(k.e*(tc - tdep)) - exp(-k.e*tdep))/k.e

    result <- data.frame(time = c(tacc, tdep),
                         conc = c(Cacc, Cdep))
    return(result)
}
C_water <- unique(df$Cwater)     # exposure concentration in water (assumed constant over the experiment)
tc <- 1                          # time until exposure ends [days]

Helper Function

This function computes the 5%, 50%, and 95% intervals for each point in time using Monte Carlo simulations of the model output.

## Summarise Monte Carlo output column-wise: for every simulation time
## point (column of X) compute the 5%, 50%, and 95% quantiles over the
## simulation runs (rows of X). Returns a data frame with columns
## p05, p50, p95, and time.
compute.intervals <- function(X, tsim){
    qs <- apply(X, 2, quantile, probs = c(0.05, 0.5, 0.95))
    out <- as.data.frame(t(qs))
    names(out) <- c("p05", "p50", "p95")
    out$time <- tsim
    out
}

Perspective 1: Predefined Parameters

Using predefined parameters based on Hendriks et al. (2001), we run the model to predict internal concentrations and compare them to the observed data. We also sample parameter uncertainty and plot the 90% prediction interval.

## Predefined parameter values after Hendriks et al. (2001):
## uptake rate k.u and elimination rate k.e.
parameters.P1 <- c(k.u = 112.798, k.e = 0.00296)

tsim <- seq(0, 7.5, length=111)  # simulation grid [days]
simu.mean <- bioacc(parameters.P1, C_water, tc, tsim)

## NOTE: time axis is in days (see data plot above), not hours.
ggplot(simu.mean, aes(time, conc)) +
    geom_line(col = "orange", linewidth = 1.5) +
    xlab("Time (days)") +
    ylab("Internal predicted concentrations") +
    geom_vline(xintercept = 1, linetype="dashed") +
    geom_point(data = df, aes(time, Cinternal)) +
    ggtitle("Perspective 1")

## Propagate parameter uncertainty: draw k.u and k.e from log-normal
## distributions centered on the predefined values with sdlog = ln(10),
## i.e. one order of magnitude spread.
## The distribution parameters are loop-invariant, so compute them once;
## log10(x)/log10(exp(1)) is simply the natural logarithm.
meanlog.k.u <- log(parameters.P1['k.u'])
meanlog.k.e <- log(parameters.P1['k.e'])
sdlog <- log(10)

n <- 10000
X.para <- matrix(NA, ncol=length(tsim), nrow=n)
for(i in seq_len(n)){
    para <- c(rlnorm(1, meanlog.k.u, sdlog),
              rlnorm(1, meanlog.k.e, sdlog))

    X.para[i,] <- bioacc(para, C_water, tc, tsim)$conc
}

intervals.para <- compute.intervals(X.para, tsim)

ggplot(intervals.para) +
    geom_line(aes(x=time, y=p50), col = "orange", linewidth = 1.5) +
    geom_ribbon(aes(x=time, ymin=p05, ymax=p95), alpha = 0.2) +
    xlab("Time (days)") +
    ylab("Internal concentration") +
    geom_vline(xintercept = 1, linetype="dashed") +
    geom_point(data = df, aes(time, Cinternal)) +
    ggtitle("Perspective 1 - 90% parameter uncertainty interval")

Perspective 2: Maximum Likelihood Estimation (MLE)

Normally distributed errors

We estimate the parameters by maximizing the likelihood of the observed data, assuming normal- and gamma-distributed errors. We then fit the model using these MLE parameters.

## Log-likelihood of the observed internal concentrations under
## i.i.d. normal errors.
##
## theta: c(k.u, k.e, log.sd); log.sd is the natural log of the error
##        standard deviation, so the SD stays positive for any
##        unconstrained optimizer step.
## data:  data frame with columns time, Cwater, Cinternal
##        (time must be sorted, as required by bioacc).
## LOQ:   limit of quantification; zeros in Cinternal are replaced
##        by LOQ/2.
## tc:    end of the exposure phase [days]; generalized from the
##        previously hard-coded value, default 1 keeps old behavior.
log.ll.normal <- function(theta, data, LOQ=6, tc=1) {
    conc.pred <- bioacc(theta, unique(data$Cwater), tc = tc, tsim = data$time)$conc
    obs <- ifelse(data$Cinternal == 0, LOQ/2, data$Cinternal)
    sum(dnorm(obs, mean = conc.pred, sd = exp(theta[3]), log = TRUE))
}

## Maximum likelihood fit (Nelder-Mead) of the normal-error model to
## the FULL dataset df.
## BUG FIX: the original passed `data = data.boot`, but data.boot is
## only created later in the bootstrap section; the full data df is
## what should be fitted here.
mle <- maxLik::maxLik(log.ll.normal,
                      start = c(k.u=112, k.e=0.003, log.sd=log(1)),
                      data = df, LOQ = LOQ,
                      method = "NM")
summary(mle)
## --------------------------------------------
## Maximum Likelihood estimation
## Nelder-Mead maximization, 124 iterations
## Return code 0: successful convergence 
## Log-Likelihood: -309.6196 
## 3  free parameters
## Estimates:
##         Estimate Std. error t value Pr(> t)    
## k.u    112.94132    2.91237   38.78  <2e-16 ***
## k.e      1.14437    0.08087   14.15  <2e-16 ***
## log.sd   3.00441    0.08452   35.55  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## --------------------------------------------
parameters.P2 <- mle$estimate
simu.mean <- bioacc(parameters.P2, C_water, tc, tsim)

## NOTE: time axis is in days, consistent with the rest of the document.
ggplot(simu.mean, aes(time, conc)) +
    geom_line(col = "orange", linewidth = 1.5) +
    xlab("Time (days)") +
    ylab("Internal predicted concentrations") +
    geom_vline(xintercept = 1, linetype="dashed") +
    geom_point(data = df, aes(time, Cinternal)) +
    ggtitle("MLE fit for normal errors")

Gamma-distributed errors

We repeat the above process for gamma-distributed errors.

## Log-likelihood under i.i.d. gamma errors, parameterized so that the
## model prediction is the MODE of the gamma distribution and exp(theta[3])
## its standard deviation. Solving
##   mode = (shape - 1) * scale  and  sd^2 = shape * scale^2
## for scale gives the quadratic scale^2 + mode*scale - sd^2 = 0, hence
##   scale = (sqrt(mode^2 + 4*sd^2) - mode) / 2,  shape = mode/scale + 1.
##
## theta: c(k.u, k.e, log.sd); data: columns time, Cwater, Cinternal
## (time sorted); LOQ: zeros replaced by LOQ/2; tc: end of exposure
## [days], generalized from the hard-coded value (default keeps old
## behavior).
log.ll.gamma <- function(theta, data, LOQ=6, tc=1) {
    mode <- bioacc(theta, unique(data$Cwater), tc = tc, tsim = data$time)$conc
    sd <- exp(theta[3])
    scale <- 0.5*(sqrt(4*sd^2 + mode^2) - mode)
    shape <- mode/scale + 1
    obs <- ifelse(data$Cinternal == 0, LOQ/2, data$Cinternal)
    sum(dgamma(obs, shape = shape, scale = scale, log = TRUE))
}

## Maximum likelihood fit of the gamma-error model to the full dataset.
## BUG FIX: the original passed `data = data.boot` (not yet defined at
## this point); use the full data df.
mle.g <- maxLik::maxLik(log.ll.gamma,
                        start = c(k.u=112, k.e=0.003, log.sd=log(1)),
                        data = df, LOQ = LOQ,
                        method= "NM")
summary(mle.g)
## --------------------------------------------
## Maximum Likelihood estimation
## Nelder-Mead maximization, 170 iterations
## Return code 0: successful convergence 
## Log-Likelihood: -300.9527 
## 3  free parameters
## Estimates:
##         Estimate Std. error t value Pr(> t)    
## k.u    109.41929    2.07022   52.85  <2e-16 ***
## k.e      1.23121    0.07784   15.82  <2e-16 ***
## log.sd   2.97415    0.09497   31.32  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## --------------------------------------------
## BUG FIX: the original assigned mle$estimate (the NORMAL-error fit);
## the gamma-error parameters come from mle.g.
parameters.P2.g <- mle.g$estimate
simu.mean.g <- bioacc(parameters.P2.g, C_water, tc, tsim)

ggplot(simu.mean.g, aes(time, conc)) +
    geom_line(col = "orange", linewidth = 1.5) +
    xlab("Time (days)") +
    ylab("Internal predicted concentrations") +
    geom_vline(xintercept = 1, linetype="dashed") +
    geom_point(data = df, aes(time, Cinternal)) +
    ggtitle("MLE fit for gamma errors")

Bootstrapping

Maximum likelihood estimation computes confidence intervals for the parameters based on large-sample-size approximations. Bootstrapping avoids these approximations and also offers a straightforward way to propagate parameter uncertainty through the model to compute confidence intervals for predictions.

Normally distributed errors

# run non-parametric bootstrap

M <- 3000
thetas.boot <- matrix(NA, ncol=3, nrow=M)
colnames(thetas.boot) <- c("k.u", "k.e", "log.sd")
for(i in seq_len(M)){
    ## resample observations with replacement and refit the model
    idx <- sample(nrow(df), nrow(df), replace=TRUE)
    data.boot <- df[idx,]
    data.boot <- data.boot[order(data.boot$time),]  # bioacc requires sorted times
    fit <- maxLik::maxNM(log.ll.normal,
                         start = parameters.P2,
                         data = data.boot, LOQ = LOQ,
                         parscale = c(100, 1, 5))

    if(fit$code <= 2){ # check for convergence; failed fits stay NA
        thetas.boot[i,] <- coef(fit)
    }
}

pairs(thetas.boot, col=gray(0, alpha=0.1))

## propagate parameters through the model; drop rows from non-converged
## fits (left as NA above), otherwise quantile() in compute.intervals
## fails on the NA simulation rows
thetas.ok <- thetas.boot[complete.cases(thetas.boot), , drop=FALSE]
n.ok <- nrow(thetas.ok)
X.para <- X.total <- matrix(NA, ncol=length(tsim), nrow=n.ok)
for(i in seq_len(n.ok)){
    para <- thetas.ok[i,]
    X.para[i,] <- bioacc(para, C_water, tc, tsim)$conc          # parameter uncertainty only
    sd <- exp(para[3])
    X.total[i,] <- rnorm(length(tsim), mean=X.para[i,], sd=sd)  # plus observation noise
}

intervals.para <- compute.intervals(X.para, tsim)
intervals.total <- compute.intervals(X.total, tsim)

ggplot() +
    geom_ribbon(data=intervals.total, aes(x=time, ymin=p05, ymax=p95), fill = gray(0.75)) +
    geom_ribbon(data=intervals.para, aes(x=time, ymin=p05, ymax=p95), fill = gray(0.6)) +
    geom_line(data = simu.mean, aes(time, conc), col = "orange", linewidth = 1.5) +
    xlab("Time (days)") +
    ylab("Internal concentration") +
    geom_vline(xintercept = 1, linetype="dashed") +
    geom_point(data = df, aes(time, Cinternal)) +
    ggtitle("Bootstrapped 90% confidence and prediction intervals with normal errors")

Gamma-distributed errors

Again, we repeat the same procedure for the likelihood function based on the gamma distribution.

# run non-parametric bootstrap

M <- 3000
thetas.boot.g <- matrix(NA, ncol=3, nrow=M)
colnames(thetas.boot.g) <- c("k.u", "k.e", "log.sd")
for(i in seq_len(M)){
    ## resample observations with replacement and refit the model
    idx <- sample(nrow(df), nrow(df), replace=TRUE)
    data.boot <- df[idx,]
    data.boot <- data.boot[order(data.boot$time),]  # bioacc requires sorted times
    fit <- maxLik::maxNM(log.ll.gamma,
                         start = parameters.P2.g,
                         data = data.boot, LOQ = LOQ,
                         parscale = c(100, 1, 5))

    if(fit$code <= 2){ # check for convergence; failed fits stay NA
        thetas.boot.g[i,] <- coef(fit)
    }
}

pairs(thetas.boot.g, col=gray(0, alpha=0.1))

## propagate parameters through the model; drop rows from non-converged
## fits (left as NA above), otherwise quantile() in compute.intervals
## fails on the NA simulation rows
thetas.ok.g <- thetas.boot.g[complete.cases(thetas.boot.g), , drop=FALSE]
n.ok.g <- nrow(thetas.ok.g)
X.para.g <- X.total.g <- matrix(NA, ncol=length(tsim), nrow=n.ok.g)
for(i in seq_len(n.ok.g)){
    para <- thetas.ok.g[i,]
    X.para.g[i,] <- mode <- bioacc(para, C_water, tc, tsim)$conc
    ## same mode/sd -> shape/scale conversion as in log.ll.gamma
    sd <- exp(para[3])
    scale <- 0.5*(sqrt(4*sd^2+mode^2) - mode)
    shape <- mode/scale + 1
    X.total.g[i,] <- rgamma(length(tsim), scale=scale, shape=shape)
}

intervals.para.g <- compute.intervals(X.para.g, tsim)
intervals.total.g <- compute.intervals(X.total.g, tsim)

ggplot() +
    geom_ribbon(data=intervals.total.g, aes(x=time, ymin=p05, ymax=p95), fill = gray(0.75)) +
    geom_ribbon(data=intervals.para.g, aes(x=time, ymin=p05, ymax=p95), fill = gray(0.6)) +
    geom_line(data = simu.mean.g, aes(time, conc), col = "orange", linewidth = 1.5) +
    xlab("Time (days)") +
    ylab("Internal concentration") +
    geom_vline(xintercept = 1, linetype="dashed") +
    geom_point(data = df, aes(time, Cinternal)) +
    ggtitle("Bootstrapped 90% confidence and prediction intervals with gamma errors")

Perspective 3: Bayesian Inference

We incorporate prior knowledge into the model using a Bayesian approach, which allows us to sample from the posterior distribution of the parameters and estimate the uncertainty in our predictions.

Prior Definition

As in Perspective 1, the priors for the parameters are based on Hendriks et al. (2001).

## Log prior density for theta = c(k.u, k.e, log.sd).
## k.u and k.e get log-normal priors centered on the Hendriks et al.
## (2001) values with sdlog = ln(10), i.e. one order of magnitude
## spread; log.sd gets a normal N(2, 2) prior.
log.prior <- function(theta) {
    k.u <- 112.798    # uptake rate, Hendriks et al. (2001)
    k.e <- 0.00296    # elimination rate, Hendriks et al. (2001)

    ## log10(x)/log10(exp(1)) is just the natural logarithm
    meanlog.k.u <- log(k.u)
    meanlog.k.e <- log(k.e)
    sdlog <- log(10)

    dlnorm(theta[1], meanlog.k.u, sdlog, log=TRUE) +
        dlnorm(theta[2], meanlog.k.e, sdlog, log=TRUE) +
        dnorm(theta[3], 2, 2, log=TRUE)
}

Posterior Distribution

We define functions proportional to the posterior distribution, incorporating both likelihood and prior information.

## Unnormalized log posterior for the normal-error model:
## log prior plus log likelihood (extra arguments such as LOQ are
## forwarded to the likelihood).
log.post.normal <- function(theta, data, ...) {
    log.prior(theta) + log.ll.normal(theta, data, ...)
}

## Unnormalized log posterior for the gamma-error model.
log.post.gamma <- function(theta, data, ...) {
    log.prior(theta) + log.ll.gamma(theta, data, ...)
}

MCMC Sampling

We sample from the posterior distribution using a Markov chain Monte Carlo (MCMC) method. This helps us quantify the uncertainty in the model predictions.

## Draw 15000 posterior samples for the normal-error model.
## NOTE(review): MCMC()/convert.to.coda() look like the adaptMCMC
## package (adaptive Metropolis with target acceptance rate 0.2 and
## initial proposal scales c(100, 1, 1)) -- confirm which package is
## attached earlier in the document.
pp.normal <- MCMC(log.post.normal, n=15000,
                  init = c(k.u=90, k.e=1, log.sd=2),
                  scale = c(100, 1, 1), acc.rate=0.2,
                  showProgressBar = FALSE,
                  data=df, LOQ = LOQ)
##   generate 15000 samples
post.normal <- convert.to.coda(pp.normal)  # coda object for trace/density plots

plot(post.normal)                              # whole chain

## scatterplot matrix of post burn-in samples (first 5000 discarded)
pairs(pp.normal$samples[5000:15000,], col=gray(0, alpha=0.02))

## Same sampler for the gamma-error model (LOQ left at its default here)
pp.gamma <- MCMC(log.post.gamma, n=15000,
                 init = c(k.u=90, k.e=1, log.sd=2),
                 scale = c(100, 1, 1),
                 showProgressBar = FALSE,
                 data=df, acc.rate=0.2)
##   generate 15000 samples
post.gamma <- convert.to.coda(pp.gamma)

plot(post.gamma)                              # whole chain

## scatterplot matrix of post burn-in samples (first 5000 discarded)
pairs(pp.gamma$samples[5000:15000,], col=gray(0, alpha=0.02))

Credible and Posterior Prediction Intervals

Finally, we plot the 90% credible intervals for normal and gamma errors, incorporating both parameter and total uncertainty.

## Sample parameter sets from the post burn-in posterior and propagate
## them through the model; X.total additionally adds normal observation
## noise (posterior predictive).
n <- 10000
X.para <- X.total <- matrix(NA, ncol=length(tsim), nrow=n)
for(i in seq_len(n)){
    idx <- sample(5000:15000, 1) # avoid the first 5000 burn-in samples
    para <- pp.normal$samples[idx,]
    X.para[i,] <- bioacc(para, C_water, tc, tsim)$conc
    sd <- exp(para[3])
    X.total[i,] <- rnorm(length(tsim), mean=X.para[i,], sd=sd)
}

intervals.para <- compute.intervals(X.para, tsim)
intervals.total <- compute.intervals(X.total, tsim)

ggplot(intervals.para) +
    geom_line(aes(x=time, y=p50), col = "orange", linewidth = 1.5) +
    geom_ribbon(aes(x=time, ymin=p05, ymax=p95), alpha = 0.2) +
    geom_ribbon(data=intervals.total, aes(x=time, ymin=p05, ymax=p95), alpha = 0.2) +
    xlab("Time (days)") +
    ylab("Internal concentration") +
    geom_vline(xintercept = 1, linetype="dashed") +
    geom_point(data = df, aes(time, Cinternal)) +
    ggtitle("Bayesian 90% parameter and total uncertainty interval with normal errors")

## Same propagation for the gamma-error model; observation noise is
## drawn from the gamma distribution with the model prediction as mode
## (same parameterization as in log.ll.gamma).
n <- 10000
X.para <- X.total <- matrix(NA, ncol=length(tsim), nrow=n)
for(i in seq_len(n)){
    idx <- sample(5000:15000, 1) # avoid the first 5000 burn-in samples
    para <- pp.gamma$samples[idx,]
    X.para[i,] <- mode <- bioacc(para, C_water, tc, tsim)$conc
    sd <- exp(para[3])
    scale <- 0.5*(sqrt(4*sd^2+mode^2) - mode)
    shape <- mode/scale + 1
    X.total[i,] <- rgamma(length(tsim), scale=scale, shape=shape)
}

intervals.para <- compute.intervals(X.para, tsim)
intervals.total <- compute.intervals(X.total, tsim)

ggplot(intervals.para) +
    geom_line(aes(x=time, y=p50), col = "orange", linewidth = 1.5) +
    geom_ribbon(aes(x=time, ymin=p05, ymax=p95), alpha = 0.2) +
    geom_ribbon(data=intervals.total, aes(x=time, ymin=p05, ymax=p95), alpha = 0.2) +
    xlab("Time (days)") +
    ylab("Internal concentration") +
    geom_vline(xintercept = 1, linetype="dashed") +
    geom_point(data = df, aes(time, Cinternal)) +
    ggtitle("Bayesian 90% parameter and total uncertainty interval with gamma errors")

Compare Parameters

For Perspective 3, we can see how much we have learned from the data by comparing the prior and the posterior distributions. Note that, strictly speaking, the bootstrap distribution cannot be compared to the prior. The prior represents (inter-)subjective prior knowledge, whereas the bootstrap reflects the distribution of the frequentist estimator of the parameter.

## Define the prior density of each parameter separately (for plotting),
## with the same log-normal parameters as in log.prior():
## centered on the Hendriks et al. (2001) values, sdlog = ln(10).
k.u <- 112.798
meanlog.k.u <- log(k.u)   # log10(x)/log10(e) is just the natural log

k.e <- 0.00296
meanlog.k.e <- log(k.e)

sdlog <- log(10)          # one order of magnitude spread

## prior densities on the natural scale (dlnorm returns the density by
## default, so no log flag is needed)
d.ku <- function(theta) dlnorm(theta, meanlog.k.u, sdlog)
d.ke <- function(theta) dlnorm(theta, meanlog.k.e, sdlog)

Normal Errors

par(mfrow=c(2,2))

## evaluation grids for the prior density curves
ku.plt <- seq(0, 300, 1)
ke.plt <- seq(0, 3, 0.01)

## --- bootstrapping
## BUG FIX: the original called lines(ku, d.ku(ku), ...) and
## lines(ke, d.ke(ke), ...), but no variables `ku`/`ke` exist --
## the grids are named ku.plt / ke.plt.
hist(thetas.boot[,"k.u"], xlim=c(0, 300),
     xlab="k.u", main = "Prior and bootstrap distribution", probability = TRUE, breaks=25)
lines(ku.plt, d.ku(ku.plt), type="l", col="red", lwd=2)

hist(thetas.boot[,"k.e"],  xlim=c(0, 3),
     xlab="k.e", main = "Prior and bootstrap distribution", probability = TRUE, breaks=25)
lines(ke.plt, d.ke(ke.plt), type="l", col="red", lwd=2)


## --- Bayesian
hist(pp.normal$samples[5000:15000,"k.u"], xlim=c(0, 300),
     xlab="k.u", main = "Prior and Bayesian posterior", probability = TRUE, breaks=25)
lines(ku.plt, d.ku(ku.plt), type="l", col="red", lwd=2)

hist(pp.normal$samples[5000:15000,"k.e"],  xlim=c(0, 3),
     xlab="k.e", main = "Prior and Bayesian posterior", probability = TRUE, breaks=25)
lines(ke.plt, d.ke(ke.plt), type="l", col="red", lwd=2)

Gamma Errors

par(mfrow=c(2,2))

## evaluation grids for the prior density curves
ku.plt <- seq(0, 300, 1)
ke.plt <- seq(0, 3, 0.01)

## --- bootstrapping
## BUG FIX: the original referenced undefined variables `ku`/`ke`;
## the grids are named ku.plt / ke.plt.
hist(thetas.boot.g[,"k.u"], xlim=c(0, 300),
     xlab="k.u", main = "Prior and bootstrap distribution", probability = TRUE, breaks=25)
lines(ku.plt, d.ku(ku.plt), type="l", col="red", lwd=2)

hist(thetas.boot.g[,"k.e"],  xlim=c(0, 3),
     xlab="k.e", main = "Prior and bootstrap distribution", probability = TRUE, breaks=25)
lines(ke.plt, d.ke(ke.plt), type="l", col="red", lwd=2)


## --- Bayesian
hist(pp.gamma$samples[5000:15000,"k.u"], xlim=c(0, 300),
     xlab="k.u", main = "Prior and Bayesian posterior", probability = TRUE, breaks=25)
lines(ku.plt, d.ku(ku.plt), type="l", col="red", lwd=2)

hist(pp.gamma$samples[5000:15000,"k.e"],  xlim=c(0, 3),
     xlab="k.e", main = "Prior and Bayesian posterior", probability = TRUE, breaks=25)
lines(ke.plt, d.ke(ke.plt), type="l", col="red", lwd=2)